Clustering

library(tidyverse)

Load data

# Directory of per-stock order-book CSV files. The name parsing below strips a
# leading "stock_" and a trailing ".csv" from each file name.
dir.short <- "data/individual_book_train"
all.files.short <- list.files(dir.short)

# Empty template that fixes the column names and types of the summary table.
# It is also the result when the directory contains no files.
df_cl <- data.frame(name = character(),
                    mean_BAS = numeric(),
                    var_BAS = numeric(),
                    price_diff = numeric(),
                    stringsAsFactors = FALSE)

# One single-row data frame per file, stored in a preallocated list and bound
# once after the loop. Calling rbind() inside the loop re-copies the whole
# table on every iteration, and appending an unnamed list loses the column
# names declared in the template.
rows <- vector("list", length(all.files.short))
names(rows) <- all.files.short

for (i in all.files.short) {
  stock <- read.csv(file.path(dir.short, i))
  # Relative bid-ask spread at the first book level.
  stock <- stock |> mutate(BidAskSpread = ask_price1 / bid_price1 - 1)

  # Anchored patterns with an escaped dot: only the "stock_" prefix and the
  # ".csv" extension are removed, never a match in the middle of the name.
  file_name <- sub("^stock_", "", sub("\\.csv$", "", i))
  mean_BAS <- mean(stock$BidAskSpread)
  var_BAS <- var(stock$BidAskSpread)
  # Mean absolute (non-relative) spread at the first book level.
  price_diff <- mean(abs(stock$ask_price1 - stock$bid_price1))

  print(file_name)

  rows[[i]] <- data.frame(name = file_name,
                          mean_BAS = mean_BAS,
                          var_BAS = var_BAS,
                          price_diff = price_diff,
                          stringsAsFactors = FALSE)
}

# Single bind; the template comes first so the columns keep their declared
# order even when `rows` is empty.
df_cl <- dplyr::bind_rows(df_cl, unname(rows))
[1] "0"
[1] "1"
[1] "10"
[1] "100"
[1] "101"
[1] "102"
[1] "103"
[1] "104"
[1] "105"
[1] "107"
[1] "108"
[1] "109"
[1] "11"
[1] "110"
[1] "111"
[1] "112"
[1] "113"
[1] "114"
[1] "115"
[1] "116"
[1] "118"
[1] "119"
[1] "120"
[1] "122"
[1] "123"
[1] "124"
[1] "125"
[1] "126"
[1] "13"
[1] "14"
[1] "15"
[1] "16"
[1] "17"
[1] "18"
[1] "19"
[1] "2"
[1] "20"
[1] "21"
[1] "22"
[1] "23"
[1] "26"
[1] "27"
[1] "28"
[1] "29"
[1] "3"
[1] "30"
[1] "31"
[1] "32"
[1] "33"
[1] "34"
[1] "35"
[1] "36"
[1] "37"
[1] "38"
[1] "39"
[1] "4"
[1] "40"
[1] "41"
[1] "42"
[1] "43"
[1] "44"
[1] "46"
[1] "47"
[1] "48"
[1] "5"
[1] "50"
[1] "51"
[1] "52"
[1] "53"
[1] "55"
[1] "56"
[1] "58"
[1] "59"
[1] "6"
[1] "60"
[1] "61"
[1] "62"
[1] "63"
[1] "64"
[1] "66"
[1] "67"
[1] "68"
[1] "69"
[1] "7"
[1] "70"
[1] "72"
[1] "73"
[1] "74"
[1] "75"
[1] "76"
[1] "77"
[1] "78"
[1] "8"
[1] "80"
[1] "81"
[1] "82"
[1] "83"
[1] "84"
[1] "85"
[1] "86"
[1] "87"
[1] "88"
[1] "89"
[1] "9"
[1] "90"
[1] "93"
[1] "94"
[1] "95"
[1] "96"
[1] "97"
[1] "98"
[1] "99"

Apply clustering

# Give the summary table its canonical column names.
names(df_cl) <- c("name", "mean_BAS", "var_BAS", "price_diff")

# Stock ids were parsed from file names as text; convert them to numbers so
# the rows sort numerically, then use the id as the row label.
df_cl[["name"]] <- as.numeric(df_cl[["name"]])
df_cl <- df_cl[order(df_cl[["name"]]), ]
rownames(df_cl) <- df_cl[["name"]]

# Drop the spread variance; the remaining columns are name, mean_BAS and
# price_diff.
df_cl <- select(df_cl, -var_BAS)

Find optimal k - scree plot

# Total within-cluster sum of squares for 1 to 15 cluster centers.
# Every column of df_cl except the first (`name`) is used as a feature, and
# each fit uses 20 random starts.
wss <- vapply(
  1:15,
  function(n_centers) {
    fit <- kmeans(df_cl[-1], centers = n_centers, nstart = 20)
    fit$tot.withinss
  },
  numeric(1)
)

# Scree plot: total within sum of squares against the number of clusters.
plot(seq_along(wss), wss,
     type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Number of clusters at the elbow of the scree plot.
k <- 4

Cluster with k = 4

# Fit the final k-means model with the number of clusters chosen from the
# scree plot (`k`), rather than repeating the literal, so the two cannot drift
# apart. The first column of df_cl (`name`) is excluded from the features.
km.out <- kmeans(df_cl[-1], centers = k, nstart = 20)

# Plotting table: one row per stock with its features and assigned cluster.
df <- data.frame(
  names = row.names(df_cl),
  mean_BAS = df_cl$mean_BAS,
  price_diff = df_cl$price_diff,
  cluster = factor(km.out$cluster)
)

# Scatter plot of the two features, coloured by cluster. The `label`
# aesthetic is mapped once here and inherited by geom_text().
plot <- ggplot(df, aes(x = mean_BAS, y = price_diff,
                       color = cluster, label = names)) +
  geom_point() +
  geom_text(vjust = -1, hjust = 1) +
  theme_minimal() +
  labs(title = "Cluster Plot", x = "mean_BAS", y = "price_diff")

# Convert the ggplot object to an interactive plotly widget.
library(plotly)
ggplotly(plot)